{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.corpora import WikiCorpus\n",
    "from gensim import utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2020-02-05 22:58:24--  https://dumps.wikimedia.org/idwiki/latest/idwiki-latest-pages-articles.xml.bz2\n",
      "Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.7, 2620:0:861:1:208:80:154:7\n",
      "Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.7|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 569866060 (543M) [application/octet-stream]\n",
      "Saving to: ‘idwiki-latest-pages-articles.xml.bz2’\n",
      "\n",
      "idwiki-latest-pages  85%[================>   ] 467.31M  28.4KB/s    in 41m 24s \n",
      "\n",
      "2020-02-05 23:39:50 (193 KB/s) - Connection closed at byte 490012334. Retrying.\n",
      "\n",
      "--2020-02-05 23:39:51--  (try: 2)  https://dumps.wikimedia.org/idwiki/latest/idwiki-latest-pages-articles.xml.bz2\n",
      "Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.7|:443... connected.\n",
      "HTTP request sent, awaiting response... 206 Partial Content\n",
      "Length: 569866060 (543M), 79853726 (76M) remaining [application/octet-stream]\n",
      "Saving to: ‘idwiki-latest-pages-articles.xml.bz2’\n",
      "\n",
      "idwiki-latest-pages 100%[+++++++++++++++++==>] 543.47M   427KB/s    in 11m 5s  \n",
      "\n",
      "2020-02-05 23:50:59 (117 KB/s) - ‘idwiki-latest-pages-articles.xml.bz2’ saved [569866060/569866060]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget https://dumps.wikimedia.org/idwiki/latest/idwiki-latest-pages-articles.xml.bz2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize(content, token_min_len = 2, token_max_len = 15, lower = True):\n",
    "    # override original method in wikicorpus.py\n",
    "    return [\n",
    "        utils.to_unicode(token)\n",
    "        for token in content.split()\n",
    "        if token_min_len <= len(token) <= token_max_len\n",
    "        and not token.startswith('_')\n",
    "    ]\n",
    "\n",
    "\n",
    "def make_corpus(in_f, out_f):\n",
    "\n",
    "    \"\"\"Convert Wikipedia xml dump file to text corpus\"\"\"\n",
    "\n",
    "    output = open(out_f, 'w')\n",
    "    wiki = WikiCorpus(in_f, tokenizer_func = tokenize)\n",
    "\n",
    "    i = 0\n",
    "    for text in wiki.get_texts():\n",
    "        output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\\n')\n",
    "        i = i + 1\n",
    "        if i % 10000 == 0:\n",
    "            print('Processed ' + str(i) + ' articles')\n",
    "    output.close()\n",
    "    print('Processing complete!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed 10000 articles\n",
      "Processed 20000 articles\n",
      "Processed 30000 articles\n",
      "Processed 40000 articles\n",
      "Processed 50000 articles\n",
      "Processed 60000 articles\n",
      "Processed 70000 articles\n",
      "Processed 80000 articles\n",
      "Processed 90000 articles\n",
      "Processed 100000 articles\n",
      "Processed 110000 articles\n",
      "Processed 120000 articles\n",
      "Processed 130000 articles\n",
      "Processed 140000 articles\n",
      "Processed 150000 articles\n",
      "Processed 160000 articles\n",
      "Processed 170000 articles\n",
      "Processed 180000 articles\n",
      "Processed 190000 articles\n",
      "Processed 200000 articles\n",
      "Processed 210000 articles\n",
      "Processed 220000 articles\n",
      "Processed 230000 articles\n",
      "Processed 240000 articles\n",
      "Processed 250000 articles\n",
      "Processed 260000 articles\n",
      "Processed 270000 articles\n",
      "Processed 280000 articles\n",
      "Processed 290000 articles\n",
      "Processed 300000 articles\n",
      "Processed 310000 articles\n",
      "Processed 320000 articles\n",
      "Processed 330000 articles\n",
      "Processed 340000 articles\n",
      "Processed 350000 articles\n",
      "Processed 360000 articles\n",
      "Processed 370000 articles\n",
      "Processed 380000 articles\n",
      "Processed 390000 articles\n",
      "Processing complete!\n"
     ]
    }
   ],
   "source": [
    "make_corpus('idwiki-latest-pages-articles.xml.bz2', 'id-wiki.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "vscode": {
   "interpreter": {
    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
